import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
# Load the HMEQ home-equity loan dataset; BAD is the binary target
# (1 appears to indicate default, per the analysis questions below)
df = pd.read_csv("hmeq.csv")
# Preview the first rows
df.head()
| BAD | LOAN | MORTDUE | VALUE | REASON | JOB | YOJ | DEROG | DELINQ | CLAGE | NINQ | CLNO | DEBTINC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1100 | 25860.0 | 39025.0 | HomeImp | Other | 10.5 | 0.0 | 0.0 | 94.366667 | 1.0 | 9.0 | NaN |
| 1 | 1 | 1300 | 70053.0 | 68400.0 | HomeImp | Other | 7.0 | 0.0 | 2.0 | 121.833333 | 0.0 | 14.0 | NaN |
| 2 | 1 | 1500 | 13500.0 | 16700.0 | HomeImp | Other | 4.0 | 0.0 | 0.0 | 149.466667 | 1.0 | 10.0 | NaN |
| 3 | 1 | 1500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | 1700 | 97800.0 | 112000.0 | HomeImp | Office | 3.0 | 0.0 | 0.0 | 93.333333 | 0.0 | 14.0 | NaN |
Data Preprocessing
# Summary statistics of the numeric columns
df.describe()
| BAD | LOAN | MORTDUE | VALUE | YOJ | DEROG | DELINQ | CLAGE | NINQ | CLNO | DEBTINC | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5960.000000 | 5960.000000 | 5442.000000 | 5848.000000 | 5445.000000 | 5252.000000 | 5380.000000 | 5652.000000 | 5450.000000 | 5738.000000 | 4693.000000 |
| mean | 0.199497 | 18607.969799 | 73760.817200 | 101776.048741 | 8.922268 | 0.254570 | 0.449442 | 179.766275 | 1.186055 | 21.296096 | 33.779915 |
| std | 0.399656 | 11207.480417 | 44457.609458 | 57385.775334 | 7.573982 | 0.846047 | 1.127266 | 85.810092 | 1.728675 | 10.138933 | 8.601746 |
| min | 0.000000 | 1100.000000 | 2063.000000 | 8000.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.524499 |
| 25% | 0.000000 | 11100.000000 | 46276.000000 | 66075.500000 | 3.000000 | 0.000000 | 0.000000 | 115.116702 | 0.000000 | 15.000000 | 29.140031 |
| 50% | 0.000000 | 16300.000000 | 65019.000000 | 89235.500000 | 7.000000 | 0.000000 | 0.000000 | 173.466667 | 1.000000 | 20.000000 | 34.818262 |
| 75% | 0.000000 | 23300.000000 | 91488.000000 | 119824.250000 | 13.000000 | 0.000000 | 0.000000 | 231.562278 | 2.000000 | 26.000000 | 39.003141 |
| max | 1.000000 | 89900.000000 | 399550.000000 | 855909.000000 | 41.000000 | 10.000000 | 15.000000 | 1168.233561 | 17.000000 | 71.000000 | 203.312149 |
# Column dtypes and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5960 entries, 0 to 5959 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BAD 5960 non-null int64 1 LOAN 5960 non-null int64 2 MORTDUE 5442 non-null float64 3 VALUE 5848 non-null float64 4 REASON 5708 non-null object 5 JOB 5681 non-null object 6 YOJ 5445 non-null float64 7 DEROG 5252 non-null float64 8 DELINQ 5380 non-null float64 9 CLAGE 5652 non-null float64 10 NINQ 5450 non-null float64 11 CLNO 5738 non-null float64 12 DEBTINC 4693 non-null float64 dtypes: float64(9), int64(2), object(2) memory usage: 605.4+ KB
# Missing-value count per column
df.isnull().sum()
BAD 0 LOAN 0 MORTDUE 518 VALUE 112 REASON 252 JOB 279 YOJ 515 DEROG 708 DELINQ 580 CLAGE 308 NINQ 510 CLNO 222 DEBTINC 1267 dtype: int64
# Inspect the value distribution (including NaN counts) of every float column
float_cols = df.select_dtypes(include='float64').columns
for feature in float_cols:
    print(f"\nColumn: {feature}")
    print(df[feature].value_counts(dropna=False))
Column: MORTDUE
MORTDUE
NaN 518
42000.0 11
47000.0 10
65000.0 9
124000.0 7
...
65372.0 1
15346.0 1
58549.0 1
69195.0 1
48811.0 1
Name: count, Length: 5054, dtype: int64
Column: VALUE
VALUE
NaN 112
60000.0 15
80000.0 14
85000.0 12
65000.0 11
...
116994.0 1
42682.0 1
72175.0 1
70095.0 1
88934.0 1
Name: count, Length: 5382, dtype: int64
Column: YOJ
YOJ
NaN 515
0.00 415
1.00 363
2.00 347
5.00 333
...
29.90 1
12.90 1
13.50 1
0.25 1
8.30 1
Name: count, Length: 100, dtype: int64
Column: DEROG
DEROG
0.0 4527
NaN 708
1.0 435
2.0 160
3.0 58
4.0 23
5.0 15
6.0 15
7.0 8
8.0 6
9.0 3
10.0 2
Name: count, dtype: int64
Column: DELINQ
DELINQ
0.0 4179
1.0 654
NaN 580
2.0 250
3.0 129
4.0 78
5.0 38
6.0 27
7.0 13
8.0 5
10.0 2
11.0 2
15.0 1
12.0 1
13.0 1
Name: count, dtype: int64
Column: CLAGE
CLAGE
NaN 308
102.500000 7
206.966667 7
177.500000 6
123.766667 6
...
240.856017 1
196.241371 1
71.461705 1
184.880011 1
219.601002 1
Name: count, Length: 5315, dtype: int64
Column: NINQ
NINQ
0.0 2531
1.0 1339
2.0 780
NaN 510
3.0 392
4.0 156
5.0 75
6.0 56
7.0 44
10.0 28
8.0 22
9.0 11
11.0 10
12.0 2
13.0 2
14.0 1
17.0 1
Name: count, dtype: int64
Column: CLNO
CLNO
16.0 316
19.0 307
24.0 264
23.0 259
21.0 235
...
58.0 3
71.0 2
53.0 2
57.0 1
63.0 1
Name: count, Length: 63, dtype: int64
Column: DEBTINC
DEBTINC
NaN 1267
34.964141 1
41.576701 1
41.395462 1
20.688715 1
...
39.244669 1
40.943866 1
30.444839 1
36.158718 1
34.571519 1
Name: count, Length: 4694, dtype: int64
# Are people with missing mortgage data more likely to default?
df.groupby(df['MORTDUE'].isna())['BAD'].mean()
MORTDUE False 0.199008 True 0.204633 Name: BAD, dtype: float64
Evaluating missing values in all numerical features and comparing default rates between missing and non-missing observations.
# For every numeric column with missing entries, report the missing rate and
# compare the default rate (mean of BAD) between rows where the value is
# missing vs. present — a large gap suggests missingness is informative.
numeric_features = df.select_dtypes(include=['float64']).columns
for feature in numeric_features:
    missing_mask = df[feature].isna()
    if missing_mask.sum() > 0:
        print("=" * 50)
        print(f"Column: {feature}")
        print("Missing %:", round(missing_mask.mean() * 100, 2))
        print(df.groupby(missing_mask)['BAD'].mean())
================================================== Column: MORTDUE Missing %: 8.69 MORTDUE False 0.199008 True 0.204633 Name: BAD, dtype: float64 ================================================== Column: VALUE Missing %: 1.88 VALUE False 0.185363 True 0.937500 Name: BAD, dtype: float64 ================================================== Column: YOJ Missing %: 8.64 YOJ False 0.206428 True 0.126214 Name: BAD, dtype: float64 ================================================== Column: DEROG Missing %: 11.88 DEROG False 0.209825 True 0.122881 Name: BAD, dtype: float64 ================================================== Column: DELINQ Missing %: 9.73 DELINQ False 0.207621 True 0.124138 Name: BAD, dtype: float64 ================================================== Column: CLAGE Missing %: 5.17 CLAGE False 0.196568 True 0.253247 Name: BAD, dtype: float64 ================================================== Column: NINQ Missing %: 8.56 NINQ False 0.204404 True 0.147059 Name: BAD, dtype: float64 ================================================== Column: CLNO Missing %: 3.72 CLNO False 0.197978 True 0.238739 Name: BAD, dtype: float64 ================================================== Column: DEBTINC Missing %: 21.26 DEBTINC False 0.085873 True 0.620363 Name: BAD, dtype: float64
Performing MICE (Multiple Imputation by Chained Equations) to impute missing values in the numerical features.
# Preview rows before imputation (row 3 is missing everything except BAD/LOAN)
df.head()
| BAD | LOAN | MORTDUE | VALUE | REASON | JOB | YOJ | DEROG | DELINQ | CLAGE | NINQ | CLNO | DEBTINC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1100 | 25860.0 | 39025.0 | HomeImp | Other | 10.5 | 0.0 | 0.0 | 94.366667 | 1.0 | 9.0 | NaN |
| 1 | 1 | 1300 | 70053.0 | 68400.0 | HomeImp | Other | 7.0 | 0.0 | 2.0 | 121.833333 | 0.0 | 14.0 | NaN |
| 2 | 1 | 1500 | 13500.0 | 16700.0 | HomeImp | Other | 4.0 | 0.0 | 0.0 | 149.466667 | 1.0 | 10.0 | NaN |
| 3 | 1 | 1500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0 | 1700 | 97800.0 | 112000.0 | HomeImp | Office | 3.0 | 0.0 | 0.0 | 93.333333 | 0.0 | 14.0 | NaN |
Checking for duplicate rows
# Count fully duplicated rows
df.duplicated().sum()
0
# MICE-style imputation of numeric columns with scikit-learn's
# IterativeImputer (each feature modeled as a function of the others).
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 — side-effect import enabling IterativeImputer
from sklearn.impute import IterativeImputer

# Select numeric float columns
num_cols = df.select_dtypes(include=['float64']).columns

# Initialize imputer with a fixed seed for reproducible imputations
imputer = IterativeImputer(random_state=42)

# Impute and round to 2 decimals, preserving column names and index.
# NOTE(review): the imputer is fit on the full dataset before the
# train/test split performed later, which leaks test-set information
# into the imputation model — consider fitting on the training split only.
df[num_cols] = (
    pd.DataFrame(
        imputer.fit_transform(df[num_cols]),
        columns=num_cols,
        index=df.index
    ).round(2)
)

# (Removed the original `df = pd.DataFrame(df)` — df is already a
# DataFrame, so that line was a no-op shallow copy.)

# Verify the numeric columns no longer contain missing values
df.isnull().sum()
BAD 0 LOAN 0 MORTDUE 0 VALUE 0 REASON 252 JOB 279 YOJ 0 DEROG 0 DELINQ 0 CLAGE 0 NINQ 0 CLNO 0 DEBTINC 0 dtype: int64
# Preview rows after numeric imputation (REASON/JOB categoricals still missing)
df.head()
| BAD | LOAN | MORTDUE | VALUE | REASON | JOB | YOJ | DEROG | DELINQ | CLAGE | NINQ | CLNO | DEBTINC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1100 | 25860.00 | 39025.00 | HomeImp | Other | 10.50 | 0.00 | 0.00 | 94.37 | 1.00 | 9.00 | 31.77 |
| 1 | 1 | 1300 | 70053.00 | 68400.00 | HomeImp | Other | 7.00 | 0.00 | 2.00 | 121.83 | 0.00 | 14.00 | 33.58 |
| 2 | 1 | 1500 | 13500.00 | 16700.00 | HomeImp | Other | 4.00 | 0.00 | 0.00 | 149.47 | 1.00 | 10.00 | 31.46 |
| 3 | 1 | 1500 | 70988.55 | 101779.25 | NaN | NaN | 8.99 | 0.27 | 0.45 | 178.90 | 1.19 | 21.25 | 33.88 |
| 4 | 0 | 1700 | 97800.00 | 112000.00 | HomeImp | Office | 3.00 | 0.00 | 0.00 | 93.33 | 0.00 | 14.00 | 33.95 |
Checking the distribution and filling the null values of categorical features with mode
# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd

# Plot 1: Distribution of REASON with respect to BAD.
# DataFrame.plot creates its own axes when none are supplied, so the
# original bare plt.figure() calls produced the empty
# "<Figure size 640x480 with 0 Axes>" artifacts — create the axes
# explicitly and hand them to pandas instead.
reason_bad = pd.crosstab(df['REASON'], df['BAD'])
fig, ax = plt.subplots()
reason_bad.plot(kind='bar', ax=ax)
ax.set_title("Distribution of BAD with Respect to REASON")
ax.set_xlabel("REASON")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

# Plot 2: Distribution of JOB with respect to BAD
job_bad = pd.crosstab(df['JOB'], df['BAD'])
fig, ax = plt.subplots()
job_bad.plot(kind='bar', ax=ax)
ax.set_title("Distribution of BAD with Respect to JOB")
ax.set_xlabel("JOB")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
# Default rate for rows with vs. without a missing REASON value
df.groupby(df['REASON'].isna())['BAD'].mean()
REASON False 0.199895 True 0.190476 Name: BAD, dtype: float64
# Default rate for rows with vs. without a missing JOB value
df.groupby(df['JOB'].isna())['BAD'].mean()
JOB False 0.205246 True 0.082437 Name: BAD, dtype: float64
# Simple imputation: fill missing categorical values with the mode
from sklearn.impute import SimpleImputer

# Categorical (object-dtype) columns
cat_cols = df.select_dtypes(include=['object']).columns

# Most-frequent-value (mode) imputer
cat_imputer = SimpleImputer(strategy='most_frequent')

# Impute while keeping the DataFrame structure (column names and index)
imputed_cats = cat_imputer.fit_transform(df[cat_cols])
df[cat_cols] = pd.DataFrame(imputed_cats, columns=cat_cols, index=df.index)

# Confirm no missing values remain anywhere in the frame
df.isna().sum()
BAD 0 LOAN 0 MORTDUE 0 VALUE 0 REASON 0 JOB 0 YOJ 0 DEROG 0 DELINQ 0 CLAGE 0 NINQ 0 CLNO 0 DEBTINC 0 dtype: int64
EDA
import os

# One count plot per categorical feature, split by the target BAD.
# Create the output folder first — the original saved into
# "categorical_plots/" without ensuring it exists, which raises
# FileNotFoundError on a fresh checkout.
os.makedirs("categorical_plots", exist_ok=True)

for col in cat_cols:
    plt.figure(figsize=(6,4))
    sns.countplot(data=df, x=col, hue='BAD')
    plt.title(f'{col} vs BAD')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"categorical_plots/{col}_vs_BAD.png", dpi=300)
    plt.show()
    plt.close()
Insights from the categorical variables
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Mean default rate (BAD) for each JOB x REASON combination
pivot_table = df.pivot_table(
    values='BAD',
    index='JOB',
    columns='REASON',
    aggfunc='mean'
)

plt.figure(figsize=(8,6))
sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="Reds")
plt.title("Default Rate Heatmap (JOB vs REASON)")

# Ensure the target folder exists instead of assuming it does
os.makedirs("categorical_plots", exist_ok=True)
plt.savefig("categorical_plots/default_rate_heatmap_JOB_vs_REASON.png",
            dpi=300, bbox_inches='tight')
plt.show()
plt.close()
# Frequency of each JOB category
df['JOB'].value_counts()
JOB Other 2667 ProfExe 1276 Office 948 Mgr 767 Self 193 Sales 109 Name: count, dtype: int64
import os
import matplotlib.pyplot as plt
import pandas as pd

# Within each JOB x REASON group, the share of good vs. bad loans
# (normalize='index' converts row counts to proportions)
pd.crosstab(
    [df['JOB'], df['REASON']],
    df['BAD'],
    normalize='index'
).plot(kind='bar', stacked=True, figsize=(10,6))
plt.title("Proportion of BAD within JOB & REASON")
plt.ylabel("Proportion")

# Ensure the output folder exists before saving
os.makedirs("categorical_plots", exist_ok=True)
plt.savefig("categorical_plots/proportion_BAD_JOB_REASON.png",
            dpi=300, bbox_inches='tight')
plt.show()
plt.close()
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Folder for the per-feature violin plots
os.makedirs("numerical_plots", exist_ok=True)

# All numeric columns, excluding the target itself from the feature list
num_cols = [c for c in df.select_dtypes(include=['int64', 'float64']).columns
            if c != 'BAD']

# One violin plot per numeric feature, split by BAD, saved and shown
for col in num_cols:
    plt.figure(figsize=(6,4))
    sns.violinplot(data=df, x='BAD', y=col, inner='quartile')
    plt.title(f'{col} distribution by BAD')
    plt.tight_layout()
    plt.savefig(f"numerical_plots/{col}_vs_BAD.png", dpi=300)
    plt.show()
    plt.close()
import seaborn as sns
import matplotlib.pyplot as plt

# Numeric feature columns, with the target removed from the feature list
num_cols = [c for c in df.select_dtypes(include=['int64', 'float64']).columns
            if c != 'BAD']

# Fixed class colours for readability: blue = 0, orange = 1
palette = {0: "#1f77b4", 1: "#ff7f0e"}

# Pairwise scatter plots with KDE diagonals, coloured by the target
sns.pairplot(
    df[num_cols + ['BAD']],
    hue='BAD',
    diag_kind='kde',
    palette=palette,
    markers=["o", "s"],  # circle for class 0, square for class 1
    plot_kws={'alpha': 0.6}
)
plt.show()
import pandas as pd

# Pearson correlation matrix over all numeric columns (target included)
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = df[numeric_columns].corr()
print(corr_matrix)
BAD LOAN MORTDUE VALUE YOJ DEROG DELINQ \
BAD 1.000000 -0.075099 -0.024085 -0.028126 -0.061991 0.263514 0.339712
LOAN -0.075099 1.000000 0.228569 0.331892 0.104063 -0.001596 -0.025774
MORTDUE -0.024085 0.228569 1.000000 0.896879 -0.078326 -0.047213 0.003688
VALUE -0.028126 0.331892 0.896879 1.000000 0.000147 -0.048287 -0.006018
YOJ -0.061991 0.104063 -0.078326 0.000147 1.000000 -0.069331 0.032649
DEROG 0.263514 -0.001596 -0.047213 -0.048287 -0.069331 1.000000 0.238561
DELINQ 0.339712 -0.025774 0.003688 -0.006018 0.032649 0.238561 1.000000
CLAGE -0.165805 0.086985 0.127219 0.176783 0.217904 -0.080652 0.026205
NINQ 0.170447 0.049260 0.032215 -0.006780 -0.082396 0.195430 0.076364
CLNO -0.003031 0.076244 0.350993 0.271876 0.022202 0.067706 0.168269
DEBTINC 0.146887 0.084961 0.207379 0.146039 -0.061950 0.033934 0.069509
CLAGE NINQ CLNO DEBTINC
BAD -0.165805 0.170447 -0.003031 0.146887
LOAN 0.086985 0.049260 0.076244 0.084961
MORTDUE 0.127219 0.032215 0.350993 0.207379
VALUE 0.176783 -0.006780 0.271876 0.146039
YOJ 0.217904 -0.082396 0.022202 -0.061950
DEROG -0.080652 0.195430 0.067706 0.033934
DELINQ 0.026205 0.076364 0.168269 0.069509
CLAGE 1.000000 -0.121013 0.247371 -0.041768
NINQ -0.121013 1.000000 0.090880 0.169463
CLNO 0.247371 0.090880 1.000000 0.212974
DEBTINC -0.041768 0.169463 0.212974 1.000000
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the correlation matrix computed above
plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
)
plt.title("Correlation Matrix")
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math

num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Folder to save boxplots
save_dir = "boxplots"
os.makedirs(save_dir, exist_ok=True)

# Save each column's boxplot as its own image.
# (plt.savefig always writes the entire current figure, so the original
# call inside the subplot-grid loop wrote the cumulative grid — not a
# single boxplot — into every "individual" file.)
for col in num_cols:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.boxplot(y=df[col], color='lightblue', ax=ax)
    ax.set_title(col)
    fig.tight_layout()
    fig.savefig(os.path.join(save_dir, f"{col}_boxplot.png"))
    plt.close(fig)

# Combined grid of all boxplots for on-screen inspection
n_cols = 3
n_rows = math.ceil(len(num_cols) / n_cols)
plt.figure(figsize=(5*n_cols, 5*n_rows))
for i, col in enumerate(num_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(y=df[col], color='lightblue')
    plt.title(col)
    plt.tight_layout()
plt.show()
Splitting the data into train and test sets; feature engineering will then be fitted on the training set only, to prevent data leakage.
import pandas as pd
from sklearn.model_selection import train_test_split

target_col = 'BAD'

# Separate features from the target
X = df.drop(columns=[target_col])
y = df[target_col]

# 80/20 split, stratified on BAD so both sets keep the same default rate
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training features shape:", X_train.shape)
print("Test features shape:", X_test.shape)
print("Training target distribution:\n", y_train.value_counts(normalize=True))
Training features shape: (4768, 12) Test features shape: (1192, 12) Training target distribution: BAD 0 0.800545 1 0.199455 Name: proportion, dtype: float64
Apply DBSCAN based outlier detection and removal technique
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Work on copies so the raw split stays untouched
X_train_dbscan = X_train.copy()
y_train_dbscan = y_train.copy()

# Numeric columns fed into DBSCAN
num_cols = X_train_dbscan.select_dtypes(include=['int64', 'float64']).columns

# DBSCAN is distance-based, so standardise the features first
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_dbscan[num_cols])

# Cluster; DBSCAN labels points that belong to no cluster (noise) as -1
dbscan = DBSCAN(eps=2, min_samples=5)
labels = dbscan.fit_predict(X_scaled)

noise_mask = labels == -1
print("Number of detected outliers:", noise_mask.sum())

# Drop the noise points from both features and target, keeping alignment
X_train_clean = X_train_dbscan[~noise_mask]
y_train_clean = y_train_dbscan[~noise_mask]

print("Shape of cleaned training features:", X_train_clean.shape)
print("Shape of cleaned training target:", y_train_clean.shape)
Number of detected outliers: 192 Shape of cleaned training features: (4576, 12) Shape of cleaned training target: (4576,)
# Inspect the outlier-free training set
X_train_clean.head()
| LOAN | MORTDUE | VALUE | REASON | JOB | YOJ | DEROG | DELINQ | CLAGE | NINQ | CLNO | DEBTINC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 609 | 7700 | 70451.00 | 81862.0 | DebtCon | Sales | 3.0 | 0.00 | 0.00 | 141.27 | 0.00 | 31.0 | 31.68 |
| 4015 | 21000 | 48735.00 | 71694.0 | DebtCon | Other | 8.0 | 0.00 | 0.00 | 48.50 | 3.00 | 10.0 | 32.92 |
| 1591 | 11500 | 63136.00 | 81099.0 | DebtCon | Other | 3.0 | 0.46 | 0.69 | 149.06 | 1.42 | 35.0 | 28.98 |
| 1127 | 9900 | 55342.00 | 72357.0 | DebtCon | Mgr | 7.0 | 0.66 | 3.00 | 112.00 | 1.00 | 11.0 | 39.87 |
| 920 | 9000 | 47350.86 | 105000.0 | DebtCon | ProfExe | 6.0 | 0.00 | 1.00 | 227.27 | 0.00 | 10.0 | 30.62 |
# Sanity check: no missing values in the cleaned training features
X_train_clean.isnull().sum()
LOAN 0 MORTDUE 0 VALUE 0 REASON 0 JOB 0 YOJ 0 DEROG 0 DELINQ 0 CLAGE 0 NINQ 0 CLNO 0 DEBTINC 0 dtype: int64
Boxplots of the numerical features after DBSCAN outlier removal, to check how much data has been lost.
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math

# Numerical columns in cleaned training data
num_cols = X_train_clean.select_dtypes(include=['int64', 'float64']).columns

# Folder to save boxplots
save_dir = "boxplots_dbscan"
os.makedirs(save_dir, exist_ok=True)

# Save each boxplot individually.
# (plt.savefig writes the whole current figure, so the original call
# inside the grid loop stored the growing grid into every file instead
# of one boxplot per file.)
for col in num_cols:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.boxplot(y=X_train_clean[col], color='lightgreen', ax=ax)
    ax.set_title(col)
    fig.tight_layout()
    fig.savefig(os.path.join(save_dir, f"{col}_boxplot.png"))
    plt.close(fig)

# Combined grid of all boxplots for display
n_cols = 3
n_rows = math.ceil(len(num_cols) / n_cols)
plt.figure(figsize=(5*n_cols, 5*n_rows))
for i, col in enumerate(num_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(y=X_train_clean[col], color='lightgreen')
    plt.title(col)
    plt.tight_layout()
plt.show()
Checking the number of unique values in categorical variables
# Unique value counts for every categorical column
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for feature in cat_cols:
    print(f"Column: {feature}")
    print(df[feature].value_counts())
    print("-" * 50)
Column: REASON REASON DebtCon 4180 HomeImp 1780 Name: count, dtype: int64 -------------------------------------------------- Column: JOB JOB Other 2667 ProfExe 1276 Office 948 Mgr 767 Self 193 Sales 109 Name: count, dtype: int64 --------------------------------------------------
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column groups taken from the cleaned training set
num_cols = X_train_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train_clean.select_dtypes(include=['object', 'category']).columns.tolist()

# Numerical features: standardise only
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical features: one-hot encode; ignore categories unseen at fit time
cat_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

# Fit on the training data only, then apply the identical transform to the
# test set — this keeps test statistics out of the scaler/encoder (no leakage)
X_train_processed = preprocessor.fit_transform(X_train_clean)
X_test_processed = preprocessor.transform(X_test)

print("Training data processed shape:", X_train_processed.shape)
print("Test data processed shape:", X_test_processed.shape)
import pandas as pd

# Rebuild readable column names for the processed training matrix
num_cols = X_train_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train_clean.select_dtypes(include=['object', 'category']).columns.tolist()

# Names of the one-hot-expanded categorical columns
onehot = preprocessor.named_transformers_['cat']['onehot']
cat_cols_expanded = onehot.get_feature_names_out(cat_cols)
all_cols = np.concatenate([num_cols, cat_cols_expanded])

# Processed array back into a labelled DataFrame
X_train_df = pd.DataFrame(X_train_processed, columns=all_cols)

# Confirm no missing values survived preprocessing
X_train_df.isnull().sum()
LOAN 0 MORTDUE 0 VALUE 0 YOJ 0 DEROG 0 DELINQ 0 CLAGE 0 NINQ 0 CLNO 0 DEBTINC 0 REASON_DebtCon 0 REASON_HomeImp 0 JOB_Mgr 0 JOB_Office 0 JOB_Other 0 JOB_ProfExe 0 JOB_Sales 0 JOB_Self 0 dtype: int64
# Preview the fully processed (scaled + one-hot encoded) training features
X_train_df.head()
| LOAN | MORTDUE | VALUE | YOJ | DEROG | DELINQ | CLAGE | NINQ | CLNO | DEBTINC | REASON_DebtCon | REASON_HomeImp | JOB_Mgr | JOB_Office | JOB_Other | JOB_ProfExe | JOB_Sales | JOB_Self | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.967113 | 0.016154 | -0.345962 | -0.824932 | -0.369110 | -0.454279 | -0.456708 | -0.737183 | 1.024241 | -0.305673 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 0.246253 | -0.492382 | -0.540519 | -0.134014 | -0.369110 | -0.454279 | -1.590043 | 1.244222 | -1.126358 | -0.114627 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 2 | -0.620437 | -0.155145 | -0.360561 | -0.824932 | 0.477641 | 0.408898 | -0.361541 | 0.200682 | 1.433879 | -0.721661 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 3 | -0.766406 | -0.337662 | -0.527833 | -0.272197 | 0.845793 | 3.298663 | -0.814288 | -0.076714 | -1.023948 | 0.956157 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | -0.848513 | -0.524795 | 0.096766 | -0.410381 | -0.369110 | 0.796702 | 0.593920 | -0.737183 | -1.126358 | -0.468987 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# xgboost is optional; catch only the import failure — the original bare
# `except:` would also silently swallow KeyboardInterrupt or typos.
try:
    from xgboost import XGBClassifier
    xgb_available = True
except ImportError:
    xgb_available = False

# Candidate models, seeded where applicable for reproducibility
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB()
}
if xgb_available:
    models["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Store training results
train_results = []

for model_name, model in models.items():
    # Fit on the processed, outlier-free training data
    model.fit(X_train_processed, y_train_clean)

    # Evaluate on the training data itself (measures fit, not generalisation —
    # see the separate test-set evaluation below)
    y_train_pred = model.predict(X_train_processed)

    # Metrics
    train_acc = (y_train_pred == y_train_clean).mean()
    class_report = classification_report(y_train_clean, y_train_pred)
    cm = confusion_matrix(y_train_clean, y_train_pred)

    # Keep the fitted model object so it can be reused on the test set
    train_results.append({
        "Model": model_name,
        "Train Accuracy": train_acc,
        "Model Object": model
    })

    # Print metrics
    print(f"--- {model_name} Training ---")
    print("Training Accuracy:", train_acc)
    print("Classification Report:\n", class_report)
    print("Confusion Matrix:\n", cm)

    # Plot confusion matrix
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Training Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    print("\n")

# Summary table (drop the unprintable fitted model objects)
train_results_df = pd.DataFrame(train_results).drop(columns=['Model Object'])
print("Summary of Training Accuracy:")
print(train_results_df)
--- Logistic Regression Training ---
Training Accuracy: 0.8435314685314685
Classification Report:
precision recall f1-score support
0 0.86 0.98 0.91 3814
1 0.60 0.18 0.28 762
accuracy 0.84 4576
macro avg 0.73 0.58 0.60 4576
weighted avg 0.81 0.84 0.81 4576
Confusion Matrix:
[[3721 93]
[ 623 139]]
--- Random Forest Training ---
Training Accuracy: 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 3814
1 1.00 1.00 1.00 762
accuracy 1.00 4576
macro avg 1.00 1.00 1.00 4576
weighted avg 1.00 1.00 1.00 4576
Confusion Matrix:
[[3814 0]
[ 0 762]]
--- Gradient Boosting Training ---
Training Accuracy: 0.930506993006993
Classification Report:
precision recall f1-score support
0 0.93 0.99 0.96 3814
1 0.94 0.62 0.75 762
accuracy 0.93 4576
macro avg 0.93 0.81 0.85 4576
weighted avg 0.93 0.93 0.92 4576
Confusion Matrix:
[[3784 30]
[ 288 474]]
--- SVM (RBF Kernel) Training ---
Training Accuracy: 0.8940122377622378
Classification Report:
precision recall f1-score support
0 0.89 1.00 0.94 3814
1 0.96 0.38 0.54 762
accuracy 0.89 4576
macro avg 0.93 0.69 0.74 4576
weighted avg 0.90 0.89 0.87 4576
Confusion Matrix:
[[3803 11]
[ 474 288]]
--- KNN Training ---
Training Accuracy: 0.927666083916084
Classification Report:
precision recall f1-score support
0 0.92 1.00 0.96 3814
1 0.97 0.58 0.73 762
accuracy 0.93 4576
macro avg 0.95 0.79 0.84 4576
weighted avg 0.93 0.93 0.92 4576
Confusion Matrix:
[[3801 13]
[ 318 444]]
--- Naive Bayes Training ---
Training Accuracy: 0.8122814685314685
Classification Report:
precision recall f1-score support
0 0.87 0.91 0.89 3814
1 0.42 0.35 0.38 762
accuracy 0.81 4576
macro avg 0.65 0.63 0.64 4576
weighted avg 0.80 0.81 0.80 4576
Confusion Matrix:
[[3453 361]
[ 498 264]]
--- XGBoost Training ---
Training Accuracy: 0.9995629370629371
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 3814
1 1.00 1.00 1.00 762
accuracy 1.00 4576
macro avg 1.00 1.00 1.00 4576
weighted avg 1.00 1.00 1.00 4576
Confusion Matrix:
[[3814 0]
[ 2 760]]
Summary of Training Accuracy:
Model Train Accuracy
0 Logistic Regression 0.843531
1 Random Forest 1.000000
2 Gradient Boosting 0.930507
3 SVM (RBF Kernel) 0.894012
4 KNN 0.927666
5 Naive Bayes 0.812281
6 XGBoost 0.999563
# Display the training-accuracy summary table
train_results_df
| Model | Train Accuracy | |
|---|---|---|
| 0 | Logistic Regression | 0.843531 |
| 1 | Random Forest | 1.000000 |
| 2 | Gradient Boosting | 0.930507 |
| 3 | SVM (RBF Kernel) | 0.894012 |
| 4 | KNN | 0.927666 |
| 5 | Naive Bayes | 0.812281 |
| 6 | XGBoost | 0.999563 |
# The processed test features are a NumPy array, not a DataFrame
print(type(X_test_processed))
<class 'numpy.ndarray'>
# 12 raw columns expand to 18 after one-hot encoding
print("X_test shape:", X_test.shape)
print("X_test_processed shape:", X_test_processed.shape)
X_test shape: (1192, 12) X_test_processed shape: (1192, 18)
import pandas as pd

# Label the processed test matrix with the same expanded column names
onehot = preprocessor.named_transformers_['cat']['onehot']
cat_cols_expanded = onehot.get_feature_names_out(cat_cols)
all_cols = np.concatenate([num_cols, cat_cols_expanded])

X_test_df = pd.DataFrame(X_test_processed, columns=all_cols)
X_test_df.head()
| LOAN | MORTDUE | VALUE | YOJ | DEROG | DELINQ | CLAGE | NINQ | CLNO | DEBTINC | REASON_DebtCon | REASON_HomeImp | JOB_Mgr | JOB_Office | JOB_Other | JOB_ProfExe | JOB_Sales | JOB_Self | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.620298 | -0.666164 | -0.546967 | 1.386006 | -0.36911 | -0.454279 | 0.060788 | -0.076714 | 0.102556 | 0.000926 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.118530 | -0.854207 | -0.729374 | 0.695088 | -0.36911 | -0.454279 | -0.723030 | -0.076714 | -0.409492 | 0.634152 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 2 | -0.666052 | -0.354078 | -0.655114 | -0.548565 | -0.36911 | -0.454279 | 1.765737 | -0.737183 | -1.126358 | -0.056080 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 3 | -0.793775 | -0.574226 | -0.761672 | -0.963115 | -0.36911 | 0.796702 | -1.167714 | -0.737183 | -0.204673 | 0.737378 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 4 | -1.304666 | -0.275418 | -0.583647 | -0.755840 | -0.36911 | 0.796702 | -0.121973 | -0.737183 | -0.307082 | -0.094598 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate every model fitted above on the held-out test set
test_results = []

for result in train_results:
    model_name = result["Model"]
    model = result["Model Object"]

    # Predict on the processed test features
    y_test_pred = model.predict(X_test_processed)

    # Metrics
    test_acc = accuracy_score(y_test, y_test_pred)
    class_report = classification_report(y_test, y_test_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_test_pred)

    # Weighted-average metrics summarise both classes despite the imbalance
    weighted = class_report['weighted avg']
    test_results.append({
        "Model": model_name,
        "Test Accuracy": test_acc,
        "Precision (Weighted)": weighted['precision'],
        "Recall (Weighted)": weighted['recall'],
        "F1-Score (Weighted)": weighted['f1-score']
    })

    print(f"--- {model_name} Testing ---")
    print("Test Accuracy:", test_acc)
    print("Classification Report:\n", classification_report(y_test, y_test_pred))
    print("Confusion Matrix:\n", cm)

    # Confusion-matrix heatmap for this model
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
    plt.title(f'{model_name} - Test Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    print("\n")

# Rank the models by test accuracy, best first
test_results_df = pd.DataFrame(test_results).sort_values(
    by="Test Accuracy", ascending=False
)
print("Summary of Test Performance:")
print(test_results_df)
--- Logistic Regression Testing ---
Test Accuracy: 0.8355704697986577
Classification Report:
precision recall f1-score support
0 0.84 0.97 0.90 954
1 0.73 0.28 0.40 238
accuracy 0.84 1192
macro avg 0.79 0.63 0.65 1192
weighted avg 0.82 0.84 0.80 1192
Confusion Matrix:
[[930 24]
[172 66]]
--- Random Forest Testing ---
Test Accuracy: 0.9068791946308725
Classification Report:
precision recall f1-score support
0 0.90 1.00 0.94 954
1 0.98 0.55 0.70 238
accuracy 0.91 1192
macro avg 0.94 0.77 0.82 1192
weighted avg 0.91 0.91 0.90 1192
Confusion Matrix:
[[951 3]
[108 130]]
--- Gradient Boosting Testing ---
Test Accuracy: 0.8808724832214765
Classification Report:
precision recall f1-score support
0 0.88 0.98 0.93 954
1 0.86 0.48 0.62 238
accuracy 0.88 1192
macro avg 0.87 0.73 0.77 1192
weighted avg 0.88 0.88 0.87 1192
Confusion Matrix:
[[935 19]
[123 115]]
--- SVM (RBF Kernel) Testing ---
Test Accuracy: 0.8598993288590604
Classification Report:
precision recall f1-score support
0 0.85 1.00 0.92 954
1 0.97 0.31 0.47 238
accuracy 0.86 1192
macro avg 0.91 0.65 0.69 1192
weighted avg 0.88 0.86 0.83 1192
Confusion Matrix:
[[952 2]
[165 73]]
--- KNN Testing ---
Test Accuracy: 0.8800335570469798
Classification Report:
precision recall f1-score support
0 0.87 1.00 0.93 954
1 0.97 0.41 0.58 238
accuracy 0.88 1192
macro avg 0.92 0.70 0.75 1192
weighted avg 0.89 0.88 0.86 1192
Confusion Matrix:
[[951 3]
[140 98]]
--- Naive Bayes Testing ---
Test Accuracy: 0.8062080536912751
Classification Report:
precision recall f1-score support
0 0.86 0.91 0.88 954
1 0.52 0.40 0.45 238
accuracy 0.81 1192
macro avg 0.69 0.65 0.67 1192
weighted avg 0.79 0.81 0.80 1192
Confusion Matrix:
[[866 88]
[143 95]]
--- XGBoost Testing ---
Test Accuracy: 0.8968120805369127
Classification Report:
precision recall f1-score support
0 0.90 0.98 0.94 954
1 0.89 0.55 0.68 238
accuracy 0.90 1192
macro avg 0.89 0.77 0.81 1192
weighted avg 0.90 0.90 0.89 1192
Confusion Matrix:
[[937 17]
[106 132]]
Summary of Test Performance:
Model Test Accuracy Precision (Weighted) \
1 Random Forest 0.906879 0.913876
6 XGBoost 0.896812 0.895881
2 Gradient Boosting 0.880872 0.878645
4 KNN 0.880034 0.891368
3 SVM (RBF Kernel) 0.859899 0.876452
0 Logistic Regression 0.835570 0.821840
5 Naive Bayes 0.806208 0.790559
Recall (Weighted) F1-Score (Weighted)
1 0.906879 0.896130
6 0.896812 0.887246
2 0.880872 0.867299
4 0.880034 0.859811
3 0.859899 0.828933
0 0.835570 0.804392
5 0.806208 0.796264
# Display the ranked test-performance table
test_results_df
| Model | Test Accuracy | Precision (Weighted) | Recall (Weighted) | F1-Score (Weighted) | |
|---|---|---|---|---|---|
| 1 | Random Forest | 0.906879 | 0.913876 | 0.906879 | 0.896130 |
| 6 | XGBoost | 0.896812 | 0.895881 | 0.896812 | 0.887246 |
| 2 | Gradient Boosting | 0.880872 | 0.878645 | 0.880872 | 0.867299 |
| 4 | KNN | 0.880034 | 0.891368 | 0.880034 | 0.859811 |
| 3 | SVM (RBF Kernel) | 0.859899 | 0.876452 | 0.859899 | 0.828933 |
| 0 | Logistic Regression | 0.835570 | 0.821840 | 0.835570 | 0.804392 |
| 5 | Naive Bayes | 0.806208 | 0.790559 | 0.806208 | 0.796264 |
Hyperparameter optimization for the best model (Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Randomized search over the Random Forest hyperparameter space: draw
# reproducible candidate settings, score each by mean 5-fold weighted F1
# on the training data, then refit the winner and evaluate on the test set.
rf_param_grid = {
    'n_estimators': [200, 300, 500, 700],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
}

# 30 random combinations, seeded for reproducibility.
n_iter = 30
candidates = list(ParameterSampler(rf_param_grid, n_iter=n_iter, random_state=42))

best_score, best_params = -np.inf, None
print("Starting Random Forest hyperparameter tuning...\n")

# Track the best candidate by cross-validated weighted F1.
for candidate in tqdm(candidates, desc="Tuning Progress"):
    fold_scores = cross_val_score(
        RandomForestClassifier(**candidate, random_state=42, n_jobs=-1),
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    avg = fold_scores.mean()
    if avg > best_score:
        best_score, best_params = avg, candidate

print("\nBest Parameters:", best_params)
print("Best CV Score:", best_score)

# Refit the winning configuration on the full training set.
rf_best = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
rf_best.fit(X_train_processed, y_train_clean)

# Held-out evaluation.
y_pred_rf = rf_best.predict(X_test_processed)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

cm = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:\n", cm)

plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=np.unique(y_test),
    yticklabels=np.unique(y_test)
)
plt.title("Random Forest - Test Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting Random Forest hyperparameter tuning...
Tuning Progress: 100%|██████████| 30/30 [10:36<00:00, 21.21s/it]
Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
Best CV Score: 0.91991748068158
Test Accuracy: 0.9119127516778524
Classification Report:
precision recall f1-score support
0 0.91 0.99 0.95 954
1 0.95 0.59 0.73 238
accuracy 0.91 1192
macro avg 0.93 0.79 0.84 1192
weighted avg 0.92 0.91 0.90 1192
Confusion Matrix:
[[947 7]
[ 98 140]]
# Column names produced by the fitted ColumnTransformer (e.g. "num__LOAN"),
# used below to label the model's feature importances.
feature_names = preprocessor.get_feature_names_out()
Inspecting the features that contributed most to the final Random Forest model
# Rank the preprocessed features by the tuned forest's impurity-based
# importance and show the top ten.
feature_importance = (
    pd.DataFrame(
        {'Feature': feature_names, 'Importance': rf_best.feature_importances_}
    )
    .sort_values('Importance', ascending=False)
)
print(feature_importance.head(10))
Feature Importance 9 num__DEBTINC 0.146323 6 num__CLAGE 0.118481 0 num__LOAN 0.105323 8 num__CLNO 0.096939 2 num__VALUE 0.095921 1 num__MORTDUE 0.092169 5 num__DELINQ 0.086227 3 num__YOJ 0.071277 4 num__DEROG 0.055741 7 num__NINQ 0.052061
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Randomized search over XGBoost hyperparameters: sample 30 reproducible
# combinations, score each by mean 5-fold weighted F1, refit the winner,
# and evaluate it on the held-out test set.
xgb_param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'min_child_weight': [1, 3, 5]
}
# Generate random combinations
n_iter = 30
param_list = list(ParameterSampler(xgb_param_grid, n_iter=n_iter, random_state=42))
best_score = -np.inf
best_params = None
print("Starting XGBoost hyperparameter tuning...\n")
for params in tqdm(param_list, desc="Tuning Progress"):
    # NOTE: `use_label_encoder` was deprecated in XGBoost 1.7 and removed in
    # 2.0 — passing it now triggers a warning (or a TypeError on recent
    # versions), so it is omitted here. It has no effect on the fitted model.
    model = XGBClassifier(
        **params,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    )
    scores = cross_val_score(
        model,
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1
    )
    mean_score = scores.mean()
    if mean_score > best_score:
        best_score = mean_score
        best_params = params
print("\nBest XGB Parameters:", best_params)
print("Best XGB CV Score:", best_score)
# Train best model on full training data
xgb_best = XGBClassifier(
    **best_params,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb_best.fit(X_train_processed, y_train_clean)
# Evaluate on test set
y_pred_xgb = xgb_best.predict(X_test_processed)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
plt.title("XGBoost - Test Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting XGBoost hyperparameter tuning...
Tuning Progress: 100%|██████████| 30/30 [00:17<00:00, 1.72it/s]
Best XGB Parameters: {'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best XGB CV Score: 0.9172260413327648
Test Accuracy: 0.9077181208053692
Classification Report:
precision recall f1-score support
0 0.90 0.99 0.95 954
1 0.94 0.57 0.71 238
accuracy 0.91 1192
macro avg 0.92 0.78 0.83 1192
weighted avg 0.91 0.91 0.90 1192
from sklearn.ensemble import GradientBoostingClassifier

# Randomized search over Gradient Boosting hyperparameters (20 draws),
# scored by mean 5-fold weighted F1 on the training data.
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

param_list = list(ParameterSampler(gb_param_grid, n_iter=20, random_state=42))
best_score, best_params = -np.inf, None

print("Starting Gradient Boosting tuning...\n")
for candidate in tqdm(param_list, desc="GB Tuning"):
    cv_scores = cross_val_score(
        GradientBoostingClassifier(**candidate, random_state=42),
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    avg = cv_scores.mean()
    if avg > best_score:
        best_score, best_params = avg, candidate

print("\nBest GB Parameters:", best_params)
print("Best CV Score:", best_score)

# Refit the winning configuration and evaluate on the held-out test set.
gb_best = GradientBoostingClassifier(**best_params, random_state=42)
gb_best.fit(X_train_processed, y_train_clean)
y_pred = gb_best.predict(X_test_processed)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges')
plt.title("Gradient Boosting - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting Gradient Boosting tuning...
GB Tuning: 100%|██████████| 20/20 [01:22<00:00, 4.10s/it]
Best GB Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 7, 'learning_rate': 0.1}
Best CV Score: 0.9157983124946376
Test Accuracy: 0.9077181208053692
Classification Report:
precision recall f1-score support
0 0.90 0.99 0.94 954
1 0.93 0.58 0.72 238
accuracy 0.91 1192
macro avg 0.92 0.79 0.83 1192
weighted avg 0.91 0.91 0.90 1192
from sklearn.neighbors import KNeighborsClassifier

# Randomized search over KNN hyperparameters (15 of the 20 grid points),
# scored by mean 5-fold weighted F1 on the training data.
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

param_list = list(ParameterSampler(knn_param_grid, n_iter=15, random_state=42))
best_score, best_params = -np.inf, None

print("Starting KNN tuning...\n")
for candidate in tqdm(param_list, desc="KNN Tuning"):
    cv_scores = cross_val_score(
        KNeighborsClassifier(**candidate),
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    avg = cv_scores.mean()
    if avg > best_score:
        best_score, best_params = avg, candidate

print("\nBest KNN Parameters:", best_params)
print("Best CV Score:", best_score)

# Refit the winning configuration and evaluate on the held-out test set.
knn_best = KNeighborsClassifier(**best_params)
knn_best.fit(X_train_processed, y_train_clean)
y_pred = knn_best.predict(X_test_processed)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Purples')
plt.title("KNN - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting KNN tuning...
KNN Tuning: 100%|██████████| 15/15 [00:02<00:00, 7.09it/s]
Best KNN Parameters: {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan'}
Best CV Score: 0.9115521742087347
Test Accuracy: 0.910234899328859
Classification Report:
precision recall f1-score support
0 0.90 1.00 0.95 954
1 0.99 0.55 0.71 238
accuracy 0.91 1192
macro avg 0.95 0.78 0.83 1192
weighted avg 0.92 0.91 0.90 1192
from sklearn.linear_model import LogisticRegression

# Randomized search over logistic-regression hyperparameters (the grid has
# exactly 10 combinations, so all are tried), scored by 5-fold weighted F1.
log_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
}

param_list = list(ParameterSampler(log_param_grid, n_iter=10, random_state=42))
best_score, best_params = -np.inf, None

print("Starting Logistic Regression tuning...\n")
for candidate in tqdm(param_list, desc="LogReg Tuning"):
    cv_scores = cross_val_score(
        LogisticRegression(**candidate, max_iter=1000, random_state=42),
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    avg = cv_scores.mean()
    if avg > best_score:
        best_score, best_params = avg, candidate

print("\nBest Logistic Parameters:", best_params)
print("Best CV Score:", best_score)

# Refit the winning configuration and evaluate on the held-out test set.
log_best = LogisticRegression(**best_params, max_iter=1000, random_state=42)
log_best.fit(X_train_processed, y_train_clean)
y_pred = log_best.predict(X_test_processed)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds')
plt.title("Logistic Regression - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting Logistic Regression tuning...
LogReg Tuning: 100%|██████████| 10/10 [00:00<00:00, 13.75it/s]
Best Logistic Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 10}
Best CV Score: 0.8046959306886496
Test Accuracy: 0.8338926174496645
Classification Report:
precision recall f1-score support
0 0.84 0.97 0.90 954
1 0.72 0.28 0.40 238
accuracy 0.83 1192
macro avg 0.78 0.63 0.65 1192
weighted avg 0.82 0.83 0.80 1192
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Randomized search over MLP hyperparameters (10 draws), scored by mean
# 5-fold weighted F1 on the training data.
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01],
    'solver': ['adam'],
}

param_list = list(ParameterSampler(mlp_param_grid, n_iter=10, random_state=42))
best_score, best_params = -np.inf, None

print("Starting MLP tuning...\n")
for candidate in tqdm(param_list, desc="MLP Tuning"):
    cv_scores = cross_val_score(
        MLPClassifier(**candidate, max_iter=500, random_state=42),
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    avg = cv_scores.mean()
    if avg > best_score:
        best_score, best_params = avg, candidate

print("\nBest MLP Parameters:", best_params)
print("Best CV Score:", best_score)

# Refit the winning configuration on the full training set.
mlp_best = MLPClassifier(**best_params, max_iter=500, random_state=42)
mlp_best.fit(X_train_processed, y_train_clean)

# Held-out evaluation.
y_pred = mlp_best.predict(X_test_processed)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds')
plt.title("MLP - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting MLP tuning...
MLP Tuning: 100%|██████████| 10/10 [01:39<00:00, 9.92s/it]
Best MLP Parameters: {'solver': 'adam', 'learning_rate_init': 0.001, 'hidden_layer_sizes': (100,), 'alpha': 0.0001, 'activation': 'tanh'}
Best CV Score: 0.915321407724672
Test Accuracy: 0.9026845637583892
Classification Report:
precision recall f1-score support
0 0.91 0.98 0.94 954
1 0.87 0.61 0.71 238
accuracy 0.90 1192
macro avg 0.89 0.79 0.83 1192
weighted avg 0.90 0.90 0.90 1192